import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
from pandas.plotting import scatter_matrix
df = pd.read_excel("C:/Users/BharathThatipamula/Desktop/wine/Wine_data.xlsx")
df.head(5)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
df.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 |
df.dtypes
fixed acidity float64 volatile acidity float64 citric acid float64 residual sugar float64 chlorides float64 free sulfur dioxide float64 total sulfur dioxide float64 density float64 pH float64 sulphates float64 alcohol float64 quality int64 dtype: object
# Plot histograms for specified columns
plt.figure(figsize=(12, 8))
plt.subplot(3, 2, 1)
plt.hist(df['fixed acidity'], bins=30, color='slategray', edgecolor='black')
plt.title('Fixed Acidity')
plt.subplot(3, 2, 2)
plt.hist(df['volatile acidity'], bins=30, color='slategray', edgecolor='black')
plt.title('Volatile Acidity')
plt.subplot(3, 2, 3)
plt.hist(df['citric acid'], bins=30, color='slategray', edgecolor='black')
plt.title('Citric Acid')
plt.subplot(3, 2, 4)
plt.hist(df['residual sugar'], bins=30, color='slategray', edgecolor='black')
plt.title('Residual Sugar')
plt.subplot(3, 2, 5)
plt.hist(df['chlorides'], bins=30, color='slategray', edgecolor='black')
plt.title('Chlorides')
plt.tight_layout()
plt.show()
# Plot box plots for specified columns
plt.figure(figsize=(12, 8))
plt.subplot(3, 2, 1)
sns.boxplot(x='quality', y='fixed acidity', data=df, color='slategray')
plt.title('Fixed Acidity')
plt.subplot(3, 2, 2)
sns.boxplot(x='quality', y='volatile acidity', data=df, color='slategray')
plt.title('Volatile Acidity')
plt.subplot(3, 2, 3)
sns.boxplot(x='quality', y='citric acid', data=df, color='slategray')
plt.title('Citric Acid')
plt.subplot(3, 2, 4)
sns.boxplot(x='quality', y='residual sugar', data=df, color='slategray')
plt.title('Residual Sugar')
plt.subplot(3, 2, 5)
sns.boxplot(x='quality', y='chlorides', data=df, color='slategray')
plt.title('Chlorides')
plt.tight_layout()
plt.show()
# Summary statistics
summary = df.describe()
print(summary)
fixed acidity volatile acidity citric acid residual sugar \
count 4898.000000 4898.000000 4898.000000 4898.000000
mean 6.854788 0.278241 0.334192 6.391415
std 0.843868 0.100795 0.121020 5.072058
min 3.800000 0.080000 0.000000 0.600000
25% 6.300000 0.210000 0.270000 1.700000
50% 6.800000 0.260000 0.320000 5.200000
75% 7.300000 0.320000 0.390000 9.900000
max 14.200000 1.100000 1.660000 65.800000
chlorides free sulfur dioxide total sulfur dioxide density \
count 4898.000000 4898.000000 4898.000000 4898.000000
mean 0.045772 35.308085 138.360657 0.994027
std 0.021848 17.007137 42.498065 0.002991
min 0.009000 2.000000 9.000000 0.987110
25% 0.036000 23.000000 108.000000 0.991723
50% 0.043000 34.000000 134.000000 0.993740
75% 0.050000 46.000000 167.000000 0.996100
max 0.346000 289.000000 440.000000 1.038980
pH sulphates alcohol quality
count 4898.000000 4898.000000 4898.000000 4898.000000
mean 3.188267 0.489847 10.514267 5.877909
std 0.151001 0.114126 1.230621 0.885639
min 2.720000 0.220000 8.000000 3.000000
25% 3.090000 0.410000 9.500000 5.000000
50% 3.180000 0.470000 10.400000 6.000000
75% 3.280000 0.550000 11.400000 6.000000
max 3.820000 1.080000 14.200000 9.000000
# Correlation matrix
correlation_matrix = df.iloc[:, :-1].corr()
print(correlation_matrix)
df.head()
fixed acidity volatile acidity citric acid \
fixed acidity 1.000000 -0.022697 0.289181
volatile acidity -0.022697 1.000000 -0.149472
citric acid 0.289181 -0.149472 1.000000
residual sugar 0.089021 0.064286 0.094212
chlorides 0.023086 0.070512 0.114364
free sulfur dioxide -0.049396 -0.097012 0.094077
total sulfur dioxide 0.091070 0.089261 0.121131
density 0.265331 0.027114 0.149503
pH -0.425858 -0.031915 -0.163748
sulphates -0.017143 -0.035728 0.062331
alcohol -0.120881 0.067718 -0.075729
residual sugar chlorides free sulfur dioxide \
fixed acidity 0.089021 0.023086 -0.049396
volatile acidity 0.064286 0.070512 -0.097012
citric acid 0.094212 0.114364 0.094077
residual sugar 1.000000 0.088685 0.299098
chlorides 0.088685 1.000000 0.101392
free sulfur dioxide 0.299098 0.101392 1.000000
total sulfur dioxide 0.401439 0.198910 0.615501
density 0.838966 0.257211 0.294210
pH -0.194133 -0.090439 -0.000618
sulphates -0.026664 0.016763 0.059217
alcohol -0.450631 -0.360189 -0.250104
total sulfur dioxide density pH sulphates \
fixed acidity 0.091070 0.265331 -0.425858 -0.017143
volatile acidity 0.089261 0.027114 -0.031915 -0.035728
citric acid 0.121131 0.149503 -0.163748 0.062331
residual sugar 0.401439 0.838966 -0.194133 -0.026664
chlorides 0.198910 0.257211 -0.090439 0.016763
free sulfur dioxide 0.615501 0.294210 -0.000618 0.059217
total sulfur dioxide 1.000000 0.529881 0.002321 0.134562
density 0.529881 1.000000 -0.093591 0.074493
pH 0.002321 -0.093591 1.000000 0.155951
sulphates 0.134562 0.074493 0.155951 1.000000
alcohol -0.448892 -0.780138 0.121432 -0.017433
alcohol
fixed acidity -0.120881
volatile acidity 0.067718
citric acid -0.075729
residual sugar -0.450631
chlorides -0.360189
free sulfur dioxide -0.250104
total sulfur dioxide -0.448892
density -0.780138
pH 0.121432
sulphates -0.017433
alcohol 1.000000
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
# Spearman correlation matrix
spearman_correlation_matrix = df.iloc[:, :-1].corr(method='spearman')
print(spearman_correlation_matrix)
df.head()
fixed acidity volatile acidity citric acid \
fixed acidity 1.000000 -0.042865 0.297878
volatile acidity -0.042865 1.000000 -0.150410
citric acid 0.297878 -0.150410 1.000000
residual sugar 0.106725 0.108627 0.024621
chlorides 0.094691 -0.004934 0.032659
free sulfur dioxide -0.024542 -0.081213 0.088314
total sulfur dioxide 0.112649 0.117614 0.093219
density 0.270031 0.010124 0.091425
pH -0.418341 -0.045204 -0.146193
sulphates -0.013238 -0.016902 0.079766
alcohol -0.106827 0.033967 -0.029170
residual sugar chlorides free sulfur dioxide \
fixed acidity 0.106725 0.094691 -0.024542
volatile acidity 0.108627 -0.004934 -0.081213
citric acid 0.024621 0.032659 0.088314
residual sugar 1.000000 0.227844 0.346107
chlorides 0.227844 1.000000 0.167046
free sulfur dioxide 0.346107 0.167046 1.000000
total sulfur dioxide 0.431252 0.375244 0.618616
density 0.780365 0.508302 0.327822
pH -0.180028 -0.054006 -0.006274
sulphates -0.003844 0.093931 0.052252
alcohol -0.445257 -0.570806 -0.272569
total sulfur dioxide density pH sulphates \
fixed acidity 0.112649 0.270031 -0.418341 -0.013238
volatile acidity 0.117614 0.010124 -0.045204 -0.016902
citric acid 0.093219 0.091425 -0.146193 0.079766
residual sugar 0.431252 0.780365 -0.180028 -0.003844
chlorides 0.375244 0.508302 -0.054006 0.093931
free sulfur dioxide 0.618616 0.327822 -0.006274 0.052252
total sulfur dioxide 1.000000 0.563824 -0.011829 0.157825
density 0.563824 1.000000 -0.110061 0.095079
pH -0.011829 -0.110061 1.000000 0.140243
sulphates 0.157825 0.095079 0.140243 1.000000
alcohol -0.476619 -0.821855 0.148857 -0.044868
alcohol
fixed acidity -0.106827
volatile acidity 0.033967
citric acid -0.029170
residual sugar -0.445257
chlorides -0.570806
free sulfur dioxide -0.272569
total sulfur dioxide -0.476619
density -0.821855
pH 0.148857
sulphates -0.044868
alcohol 1.000000
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
with sns.plotting_context("notebook", font_scale=1.2):
scatter_matrix(df.iloc[:, :-1], alpha=0.7, figsize=(20, 20), diagonal='kde', marker='o', color='red')
plt.suptitle("Scatterplot of Chemical Attributes", y=1.02, fontsize=10)
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Preparing data for regression
X = df.drop('quality', axis=1) # Features
y = df['quality'] # Target
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Multiple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
# Predictions and model evaluation
predictions = model.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
Mean Squared Error: 0.5690247717275313
import seaborn as sns
import matplotlib.pyplot as plt
# Calculate the correlation matrix
correlation_matrix = df.corr()
# Create a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()
from sklearn.preprocessing import PolynomialFeatures
# Creating polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# Splitting the data
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)
# Polynomial Regression
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)
# Predictions and evaluation
predictions_poly = model_poly.predict(X_test_poly)
print('Mean Squared Error for Polynomial Regression:', mean_squared_error(y_test, predictions_poly))
Mean Squared Error for Polynomial Regression: 0.6193710915884316
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Combine features and target for the training set
train_data = pd.concat([X_train, y_train], axis=1)
# Create a pair plot
sns.pairplot(train_data, vars=X.columns, markers='o', diag_kind='kde')
plt.suptitle("Pair Plot of Features and Target", y=1.02)
plt.show()
from sklearn.tree import DecisionTreeRegressor
# Decision Tree Regression
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
# Predictions and evaluation
tree_predictions = tree_model.predict(X_test)
print('Mean Squared Error for Decision Tree:', mean_squared_error(y_test, tree_predictions))
Mean Squared Error for Decision Tree: 0.6602040816326531
from sklearn.ensemble import RandomForestRegressor
# Random Forest Regression
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Predictions and evaluation
rf_predictions = rf_model.predict(X_test)
print('Mean Squared Error for Random Forest:', mean_squared_error(y_test, rf_predictions))
Mean Squared Error for Random Forest: 0.34775581632653063
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Converting the problem into a classification (optional: define thresholds for wine quality)
# For simplicity, we are using it as is.
# Splitting the data (classification)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y, test_size=0.2, random_state=42)
# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_class, y_train_class)
# Predictions and evaluation
class_predictions = rf_classifier.predict(X_test_class)
print(classification_report(y_test_class, class_predictions))
precision recall f1-score support
3 0.00 0.00 0.00 5
4 0.60 0.24 0.34 25
5 0.70 0.69 0.70 291
6 0.66 0.79 0.72 432
7 0.76 0.58 0.66 192
8 0.80 0.46 0.58 35
accuracy 0.69 980
macro avg 0.59 0.46 0.50 980
weighted avg 0.69 0.69 0.68 980
C:\Users\BharathThatipamula\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\BharathThatipamula\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\BharathThatipamula\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))